Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())data_source <- "real"
qpcr_dir <- str_c(get_data_dir(data_source), "00 Raw/03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250416.csv"
cat("We load the file", file, "\n\n")We load the file VIBRANT_qPCR_data_merged_250416.csv
qpcr_LBP <- read_csv(str_c(qpcr_dir, file))
file <- "VIBRANT_16SqPCR_20250514.csv"
cat("And the file ", file, "\n\n")And the file VIBRANT_16SqPCR_20250514.csv
qpcr_16S <- read_csv(str_c(qpcr_dir, file))
rm(qpcr_dir, file)The data is provided in long format:
qpcr_LBP |> glimpse()Rows: 51,480
Columns: 20
$ Data_row <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ Well <chr> "A02", "A03", "A04", "A05", "A06", "A07", "A0…
$ Fluor <chr> "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy…
$ Content <chr> "Unkn-01", "Unkn-01", "Unkn-01", "Unkn-02", "…
$ Replicate <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
$ Sample <chr> "A1", "A1", "A1", "A2", "A2", "A2", "A3", "A3…
$ Cq <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ `Starting Quantity (SQ)` <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ `Cq Mean` <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000…
$ plate_id <chr> "B064152", "B064152", "B064152", "B064152", "…
$ Plate_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ gDNA_Plate_ID <dbl> 2240890, 2240890, 2240890, 2240890, 2240890, …
$ VMRC_Group <chr> "VMRC_3", "VMRC_3", "VMRC_3", "VMRC_3", "VMRC…
$ Group_Fluor <chr> "VMRC_3_Cy5", "VMRC_3_Cy5", "VMRC_3_Cy5", "VM…
$ Target <chr> "185329", "185329", "185329", "185329", "1853…
$ gDNA_Plate_Well <chr> "2240890_A1", "2240890_A1", "2240890_A1", "22…
$ VIBR_Sample_ID <chr> "2152456", "2152456", "2152456", "2157507", "…
$ Dilution_Factor <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3…
$ Quant_Adjusted <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ Copies_per_swab <dbl> 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, …
Since the two files have exactly the same columns, we merge them into a single file after some harmonization.
First, we notice that there is only one clinical sample that has data for the LBP qPCR, but not for the 16S qPCR.
full_join(
qpcr_16S |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_16S = TRUE) |> distinct(),
qpcr_LBP |> select(VIBR_Sample_ID, gDNA_Plate_ID) |> mutate(has_LBP = TRUE) |> distinct()
) |> filter(is.na(has_16S) | is.na(has_LBP)) |>
filter(!is.na(VIBR_Sample_ID), !(VIBR_Sample_ID %in% c("#N/A", "empty"))) # A tibble: 1 × 4
VIBR_Sample_ID gDNA_Plate_ID has_16S has_LBP
<chr> <dbl> <lgl> <lgl>
1 2171048 2240890 NA TRUE
We also note that there is one sample on the same plate that has 6 replicates instead of 3 in the 16S qPCR.
qpcr_16S |> filter(!is.na(VIBR_Sample_ID)) |> dplyr::count(VIBR_Sample_ID, gDNA_Plate_ID ) |> filter(n > 3)# A tibble: 1 × 3
VIBR_Sample_ID gDNA_Plate_ID n
<chr> <dbl> <int>
1 2167612 2240890 6
@Michael: do you think this could be a mislabeling?
# We rename the standards in the 16S qPCR so they are numbered
std_16s <-
qpcr_16S |>
filter(str_detect(Content, "Std")) |>
select(Content, `Starting Quantity (SQ)`) |>
distinct() |>
arrange(`Starting Quantity (SQ)` |> desc()) |>
mutate(std_nb = row_number())
qpcr_16S <-
qpcr_16S |>
left_join(std_16s, by = join_by(Content, `Starting Quantity (SQ)`)) |>
mutate(
Content = Content |> str_c("-", std_nb |> str_pad(width = 2, pad = "0")),
Target = "16S"
) |>
select(-std_nb)
# We transform the VIBR_Sample_ID column to be consistent with the 16S qPCR
qpcr_LBP <-
qpcr_LBP |>
mutate(
VIBR_Sample_ID = ifelse(VIBR_Sample_ID == "#N/A", NA_character_, VIBR_Sample_ID)
)
qpcr <-
bind_rows(
qpcr_LBP |> mutate(target_type = "LBP"),
qpcr_16S |> mutate(target_type = "16S rRNA")
) |>
relocate(target_type, .after = Target)dictionary <-
tibble(original_name = colnames(qpcr)) |>
mutate(
description =
case_when(
(original_name == "Data_row") ~ 'Table row number',
(original_name == "Well") ~ str_c("qPCR plate well ID. From ", min(qpcr$Well), " to ", max(qpcr$Well)),
(original_name == "Fluor") ~ str_c('fluorescent dye used (one of ', qpcr$Fluor |> unique() |> str_c(collapse = ", "), ")"),
(original_name == "Content") ~ 'Indicates whether well contains a sample, a standard, or a negative control.',
(original_name == "Replicate") ~
str_c(
"The replicate number from ", min(qpcr$Replicate, na.rm = TRUE), " to ", max(qpcr$Replicate, na.rm = TRUE),
" (with ", sum(is.na(qpcr$Replicate)), " missing values). Values are repeated 495, or 990 times."
),
(original_name == "Sample") ~ 'A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses.',
(original_name == "Cq") ~ 'Quantification cycle (Cq) value.',
(original_name == "Starting Quantity (SQ)") ~ 'Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards).',
(original_name == "Cq Mean") ~ 'Mean across replicates',
(original_name == "plate_id") ~ str_c('qPCR plate ID; there are ', qpcr$plate_id |> unique() |> length()," unique plate IDs in total."),
(original_name == "Plate_number") ~ str_c("The extraction plate number; there are ", qpcr$Plate_number |> unique() |> length()," unique plate numbers in total (5 identical `plate_id` per `Plate_number`)."),
(original_name == "gDNA_Plate_ID") ~ 'The extraction plate ID',
(original_name == "VMRC_Group") ~ str_c('The group of the LBP strains. There are ', qpcr$VMRC_Group |> unique() |> na.omit() |> length()," unique VMRC groups in total (", qpcr$VMRC_Group |> unique() |> na.omit() |> sort() |> str_c(collapse = ", "), ")"),
(original_name == "Group_Fluor") ~ 'Concatenation of the `VMRC_Group` and `Fluor` columns',
(original_name == "Target") ~ str_c('The target "gene" (here taxa/strain). There are ', qpcr$Target |> unique() |> length(), " unique targets in total (", qpcr$Target |> unique() |> str_c(collapse = ", "),")"),
(original_name == "target_type") ~ "Whether the target is a LBP strain or the 16S rRNA gene target",
(original_name == "gDNA_Plate_Well") ~ str_c('The concatenation of the VIBRANT swab barcode and the gDNA plate well ID; there are ', qpcr$gDNA_Plate_Well |> unique() |> length()," unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., ", qpcr$gDNA_Plate_Well[10],")"),
(original_name == "VIBR_Sample_ID") ~ str_c("VIBRANT sample ID; there are ", qpcr$VIBR_Sample_ID |> unique() |> length(), " unique VIBRANT sample IDs in total)"),
(original_name == "Dilution_Factor") ~ str_c('The qPCR plate specific dilution factor. There are ', qpcr$Dilution_Factor |> unique() |> length()," unique dilution factors in total (", qpcr$Dilution_Factor |> unique() |> sort() |> str_c(collapse = ", "),")"),
(original_name == "Quant_Adjusted") ~ str_c("Quantification value adjusted by the dilution factor; range from ", min(qpcr$Quant_Adjusted, na.rm = TRUE)," to ", max(qpcr$Quant_Adjusted, na.rm = TRUE)),
(original_name == "Copies_per_swab") ~ str_c("The number of target copies per swab, computed from the adjusted quantification values. Ranges from ", min(qpcr$Copies_per_swab, na.rm = TRUE)," to ", max(qpcr$Copies_per_swab, na.rm = TRUE)),
TRUE ~ "????"
)
)# dictionary |> gt()We tidy the column names for consistency and readability.
qpcr <- qpcr |> janitor::clean_names()
qpcr <- qpcr |>
dplyr::rename(
strain_group_qpcr = vmrc_group,
starting_quantity = starting_quantity_sq,
pcr_plate_id = plate_id,
ext_lib_plate_nb = plate_number,
ext_lib_plate_id = g_dna_plate_id,
ext_lib_position = g_dna_plate_well
)dictionary <-
dictionary |>
mutate(name = colnames(qpcr)) |>
select(name, everything())The new column names are:
dictionary |>
gt() |>
cols_label(
name = "New column name",
original_name = "Original column name",
description = "Description"
) | New column name | Original column name | Description |
|---|---|---|
| data_row | Data_row | Table row number |
| well | Well | qPCR plate well ID. From A01 to P24 |
| fluor | Fluor | fluorescent dye used (one of Cy5, FAM, HEX, SYBR) |
| content | Content | Indicates whether well contains a sample, a standard, or a negative control. |
| replicate | Replicate | The replicate number from 1 to 96 (with 3927 missing values). Values are repeated 495, or 990 times. |
| sample | Sample | A qPCR plate-specific sample ID. This column wont be used or kept for downstream analyses. |
| cq | Cq | Quantification cycle (Cq) value. |
| starting_quantity | Starting Quantity (SQ) | Starting quantity (SQ) value; computed from the Cq values using the inverse calibration function (calibration curve fitted using the expected SQ values for the standards). |
| cq_mean | Cq Mean | Mean across replicates |
| pcr_plate_id | plate_id | qPCR plate ID; there are 66 unique plate IDs in total. |
| ext_lib_plate_nb | Plate_number | The extraction plate number; there are 11 unique plate numbers in total (5 identical `plate_id` per `Plate_number`). |
| ext_lib_plate_id | gDNA_Plate_ID | The extraction plate ID |
| strain_group_qpcr | VMRC_Group | The group of the LBP strains. There are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5) |
| group_fluor | Group_Fluor | Concatenation of the `VMRC_Group` and `Fluor` columns |
| target | Target | The target "gene" (here taxa/strain). There are 16 unique targets in total (185329, C0028A1, C0112A1, FF00004, C0006A1, 122010, UC101, FF00018, FF00051, UC119, FF00064, FF00072, C0175A1, C0022A1, C0059E1, 16S) |
| target_type | target_type | Whether the target is a LBP strain or the 16S rRNA gene target |
| ext_lib_position | gDNA_Plate_Well | The concatenation of the VIBRANT swab barcode and the gDNA plate well ID; there are 1155 unique gDNA plate wells in total in the format `[gDNA_Plate_ID]_[Well]` (e.g., 2240890_A4) |
| vibr_sample_id | VIBR_Sample_ID | VIBRANT sample ID; there are 1016 unique VIBRANT sample IDs in total) |
| dilution_factor | Dilution_Factor | The qPCR plate specific dilution factor. There are 4 unique dilution factors in total (5, 10, 20, 30) |
| quant_adjusted | Quant_Adjusted | Quantification value adjusted by the dilution factor; range from 0 to 3e+08 |
| copies_per_swab | Copies_per_swab | The number of target copies per swab, computed from the adjusted quantification values. Ranges from 0 to 7.5e+10 |
We define a qpcr_sample_type variable to summarize information from vibr_sample_id and sample columns.
qpcr <-
qpcr |>
mutate(
qpcr_sample_type =
case_when(
is.na(vibr_sample_id) & str_detect(content, "Std") ~ "Standard",
is.na(vibr_sample_id) & str_detect(sample, "water") ~ "Water",
str_detect(vibr_sample_id, "EQ") ~ "Control sample",
str_detect(vibr_sample_id, "empty") ~ "Empty",
is.na(vibr_sample_id) ~ "Water or empty",
TRUE ~ "VIBRANT clinical sample"
)
)# qpcr |> filter(target == "16S") |> arrange(qpcr_sample_type) |> View()qpcr |> dplyr::count(qpcr_sample_type, name = "n_wells") |> arrange(-n_wells) |> gt()| qpcr_sample_type | n_wells |
|---|---|
| VIBRANT clinical sample | 46560 |
| Standard | 3729 |
| Control sample | 2112 |
| Empty | 1890 |
| Water | 495 |
| Water or empty | 126 |
tmp <-
qpcr |>
filter(qpcr_sample_type == "VIBRANT clinical sample", target == target[1]) |>
group_by(vibr_sample_id) |>
summarize(
n_replicates = n(),
n_plates = ext_lib_plate_nb |> unique() |> length()
)
# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate
# tmp |> dplyr::count(n_replicates) # All samples have 3 replicatesWe note that all samples’s replicates are on a single extraction plate (ext_lib_plate_nb or ext_lib_plate_id); and all samples have the same number of replicates (3).
tmp <-
qpcr |>
filter(qpcr_sample_type == "VIBRANT clinical sample", target == "16S") |>
group_by(vibr_sample_id) |>
summarize(
n_replicates = n(),
n_plates = ext_lib_plate_nb |> unique() |> length()
)
# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate
# tmp |> dplyr::count(n_replicates) # All samples have 3 replicatesAs noted above, in the 16S qPCR, there is one sample that has 6 replicates:
tmp |>
filter(n_replicates == 6) |>
left_join(qpcr |> filter(target == "16S"), by = join_by(vibr_sample_id)) |>
select(vibr_sample_id, ext_lib_plate_nb, ext_lib_position, pcr_plate_id, data_row, well, cq) |>
gt()| vibr_sample_id | ext_lib_plate_nb | ext_lib_position | pcr_plate_id | data_row | well | cq |
|---|---|---|---|---|---|---|
| 2167612 | 1 | 2240890_E4 | b062862 | 163 | I08 | 22.40223 |
| 2167612 | 1 | 2240890_E4 | b062862 | 178 | J07 | 22.42773 |
| 2167612 | 1 | 2240890_E4 | b062862 | 179 | J08 | 22.37871 |
| 2167612 | 1 | 2240890_G5 | b062862 | 242 | M10 | 22.61241 |
| 2167612 | 1 | 2240890_G5 | b062862 | 258 | N09 | 22.52760 |
| 2167612 | 1 | 2240890_G5 | b062862 | 259 | N10 | 22.76733 |
qpcr <-
qpcr |>
mutate(
well_col = well |> str_sub(1, 1),
well_row = well |> str_sub(2,3) |> as.integer()
) |>
relocate(well_col, well_row, .after = well)We also define a unique “qPCR sample ID” according to each sample type.
qpcr <-
qpcr |>
mutate(
qpcr_sample_id =
case_when(
(qpcr_sample_type == "VIBRANT clinical sample") ~ vibr_sample_id,
(qpcr_sample_type == "Control sample") ~ vibr_sample_id,
(qpcr_sample_type == "Empty") ~
str_c(vibr_sample_id,"_plate_",
ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
(qpcr_sample_type == "Water") ~
str_c("water_plate",
ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
(qpcr_sample_type == "Water or empty") ~
str_c("water_or_empty_plate",
ext_lib_plate_nb |> str_pad(width = 2, pad = 0),"_", sample),
(qpcr_sample_type == "Standard") ~
str_c("std_", content |> str_remove("Std-"),
"_plate_", ext_lib_plate_nb |> str_pad(width = 2, pad = 0)),
TRUE ~ NA_character_
)
) |>
group_by(qpcr_sample_id, target) |>
arrange(well_col, well_row) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
mutate(qpcr_uid = str_c(qpcr_sample_id, "_r", replicate_nb))# qpcr |> filter(replicate_nb > 3) |> View()For the LBP strains, the plate layout is as follows:
qpcr |>
filter(target != "16S") |>
select(ext_lib_plate_nb, well_col, well_row, qpcr_sample_type, sample) |>
distinct() |>
ggplot() +
aes(x = well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type) +
geom_tile() +
geom_path(aes(group = sample), col = "black", alpha = 0.5) +
geom_point() +
facet_wrap(~ ext_lib_plate_nb) +
xlab("Well column") + ylab("Well row") +
labs(caption = "Black lines connect replicates") For the 16S plates, the layout is a little different:
qpcr |>
filter(target == "16S") |>
select(ext_lib_plate_nb, pcr_plate_id, well_col, well_row, qpcr_sample_type, qpcr_sample_id) |>
distinct() |>
ggplot() +
aes(x = well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = qpcr_sample_type, col = qpcr_sample_type) +
geom_tile(alpha = 0.25) +
geom_path(aes(group = qpcr_sample_id), alpha = 0.5, linewidth = 2) +
geom_point() +
facet_wrap(~ ext_lib_plate_nb + pcr_plate_id) +
xlab("Well column") + ylab("Well row") +
labs(caption = "Lines connect replicates") We merge the qPCR data with the metagenomics manifest data to add the participant ID (pid) and (visit_code).
SE_mg <-
readRDS(
list.files(
path = get_output_dir(data_source = data_source),
pattern = "02_se_mg_.*\\.rds$", full.names = TRUE
) |>
sort(decreasing = TRUE) |>
extract(1)
)qpcr <-
qpcr |>
left_join(
SE_mg |>
colData() |> as.data.frame() |> as_tibble() |>
select(
swab_barcode,
uid, mg_sample_id,
mg_pid, pid, visit_code, location,
sample_type, control_type, sample_category
) |>
distinct() ,
by = join_by("qpcr_sample_id" == "swab_barcode")
)samples_without_entries_in_mg_manifest <-
qpcr |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) |>
filter(is.na(uid)) |>
select(qpcr_sample_id, qpcr_sample_type) |>
distinct() |>
arrange(qpcr_sample_type, qpcr_sample_id) There are 48 clinical or vibrant positive/negative control samples without a matching entry in the metagenomics manifest.
samples_without_entries_in_mg_manifest |>
dplyr::count(qpcr_sample_type) |>
gt()| qpcr_sample_type | n |
|---|---|
| Control sample | 26 |
| VIBRANT clinical sample | 22 |
TODO: export for Michael?
For these samples, the current uid is missing. We change it to the qpcr_sample_id when NA.
qpcr <-
qpcr |>
mutate(
uid = case_when(
is.na(uid) &
(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")) ~
qpcr_sample_id,
TRUE ~ uid
)
)mg_samples <-
SE_mg |>
colData() |> as.data.frame() |> as_tibble() |>
select(
swab_barcode,
uid, mg_sample_id,
mg_pid, pid, visit_code,
sample_type, control_type, sample_category
) |>
distinct() |>
left_join(
qpcr |>
select(qpcr_sample_id, qpcr_sample_type) |>
distinct() |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample", "Control sample")),
by = join_by("swab_barcode" == "qpcr_sample_id")
)
# mg_samples |> filter(is.na(qpcr_sample_type)) |> nrow()There are 0 samples for which we have metagenomics data but no qPCR data.
rm(mg_samples, samples_without_entries_in_mg_manifest, tmp)The plate x Fluorescence x Target layout is as follows:
qpcr |>
dplyr::count(fluor, target, strain_group_qpcr, pcr_plate_id, ext_lib_plate_nb) |>
arrange(ext_lib_plate_nb, strain_group_qpcr, pcr_plate_id, fluor) |>
mutate(
target = target |> fct_inorder(),
plate_nb = str_c("ext. plate: ", ext_lib_plate_nb) |> fct_inorder(),
pcr_plate_id = pcr_plate_id |> fct_inorder()
) |>
ggplot() +
aes(
x = pcr_plate_id,
y = target |> factor() |> fct_rev(),
fill = fluor
) +
geom_tile() +
facet_grid(strain_group_qpcr ~ plate_nb, scales = "free", space = "free") +
xlab("qPCR plate ID") +
ylab("Target") +
scale_fill_manual(
name = "Fluor.",
values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D", "SYBR" = "green")
) +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)Potential batch effects due to extraction should be checked for ext_lib_plate_id and those due to the qPCR itself using the pcr_plate_id.
We also add the LBP strain information to the qPCR data.
qpcr <-
qpcr |>
left_join(
SE_mg |>
rowData() |> as.data.frame() |> as_tibble() |>
filter(!is.na(LBP)) |>
select(taxa, taxon_label, LBP, strain_id, strain_origin, biose_id) |>
dplyr::rename(target = taxa),
by = join_by(target)
)qpcr <-
qpcr |>
mutate(taxon_label = taxon_label |> str_replace_na("16S rRNA gene target"))rm(SE_mg)qpcr |>
arrange(strain_group_qpcr, strain_origin) |>
mutate(target = target |> fct_inorder()) |>
ggplot() +
aes(x = target, y = strain_group_qpcr, col = strain_group_qpcr) +
geom_point() +
facet_grid(. ~ LBP + strain_origin, scales = "free", space = "free") +
xlab("Target") + ylab("VMRC group") +
guides(col = "none") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
) Observed values are the cq values.
starting_quantity are computed from the cq values using the calibration curves fitted on the standard values.
\[C = f(SQ^*)\]
where \(C\) is the observed cq value for the standard, \(SQ^*\) is the known starting quantity starting_quantity for the standards, and \(f\) is the calibration function.
I assume that, either the standards are not diluted, and that the quant_adjusted and copies_per_swabs are not correct for the standards, or that the standards are diluted and that the calibration curves are fitted on the quant_adjusted:
qpcr |>
filter(qpcr_sample_type == "Standard", target != "16S") |>
select(
sample, starting_quantity, dilution_factor, quant_adjusted, copies_per_swab
) |>
distinct() |>
arrange(sample, dilution_factor) |>
gt()| sample | starting_quantity | dilution_factor | quant_adjusted | copies_per_swab |
|---|---|---|---|---|
| std 10^1 | 1e+01 | 5 | 5e+01 | 1.25e+04 |
| std 10^1 | 1e+01 | 10 | 1e+02 | 2.50e+04 |
| std 10^1 | 1e+01 | 20 | 2e+02 | 5.00e+04 |
| std 10^1 | 1e+01 | 30 | 3e+02 | 7.50e+04 |
| std 10^2 | 1e+02 | 5 | 5e+02 | 1.25e+05 |
| std 10^2 | 1e+02 | 10 | 1e+03 | 2.50e+05 |
| std 10^2 | 1e+02 | 20 | 2e+03 | 5.00e+05 |
| std 10^2 | 1e+02 | 30 | 3e+03 | 7.50e+05 |
| std 10^3 | 1e+03 | 5 | 5e+03 | 1.25e+06 |
| std 10^3 | 1e+03 | 10 | 1e+04 | 2.50e+06 |
| std 10^3 | 1e+03 | 20 | 2e+04 | 5.00e+06 |
| std 10^3 | 1e+03 | 30 | 3e+04 | 7.50e+06 |
| std 10^4 | 1e+04 | 5 | 5e+04 | 1.25e+07 |
| std 10^4 | 1e+04 | 10 | 1e+05 | 2.50e+07 |
| std 10^4 | 1e+04 | 20 | 2e+05 | 5.00e+07 |
| std 10^4 | 1e+04 | 30 | 3e+05 | 7.50e+07 |
| std 10^5 | 1e+05 | 5 | 5e+05 | 1.25e+08 |
| std 10^5 | 1e+05 | 10 | 1e+06 | 2.50e+08 |
| std 10^5 | 1e+05 | 20 | 2e+06 | 5.00e+08 |
| std 10^5 | 1e+05 | 30 | 3e+06 | 7.50e+08 |
| std 10^6 | 1e+06 | 5 | 5e+06 | 1.25e+09 |
| std 10^6 | 1e+06 | 10 | 1e+07 | 2.50e+09 |
| std 10^6 | 1e+06 | 20 | 2e+07 | 5.00e+09 |
| std 10^6 | 1e+06 | 30 | 3e+07 | 7.50e+09 |
| std 10^7 | 1e+07 | 5 | 5e+07 | 1.25e+10 |
| std 10^7 | 1e+07 | 10 | 1e+08 | 2.50e+10 |
| std 10^7 | 1e+07 | 20 | 2e+08 | 5.00e+10 |
| std 10^7 | 1e+07 | 30 | 3e+08 | 7.50e+10 |
qpcr |>
filter(qpcr_sample_type == "Standard") |>
arrange(-dilution_factor) |>
ggplot() +
aes(
# x = starting_quantity |> log10(),
x = quant_adjusted |> log10(),
y = cq,
col = dilution_factor |> factor()
) +
geom_point(size = 1, alpha = 0.2) +
scale_color_manual(
"Dilution factor",
values = colorRampPalette(c("black", "dodgerblue1"))(4)
) +
facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) For all clinical samples, the starting_quantity is computed from the cq values using the calibration curve fitted on the standard values.
\[\hat{SQ} = f^{-1}(C)\]
where \(\hat{SQ}\) is the estimated starting quantity starting_quantity for the clinical samples, \(C\) is the observed cq value for the clinical sample, and \(f^{-1}\) is the inverse of the calibration function.
If \(C\) is not observed (i.e., assumed to be larger than the total number of cycles), then \(\hat{SQ}\) is estimated as 0.
# qpcr |>
# ggplot() +
# aes(x = starting_quantity |> log10(), y = cq, col = qpcr_sample_type) +
# geom_point(size = 0.5, alpha = 0.4) +
# facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb)
qpcr |>
ggplot() +
aes(
x = starting_quantity |> log10(),
y = cq,
col = ext_lib_plate_nb |> factor(),
size = (qpcr_sample_type == "Standard")
) +
geom_point(alpha = 0.5) +
facet_wrap(strain_group_qpcr + target ~ ., ncol = 6) +
scale_size_manual("Standard", values = c(0.2, 1)) +
scale_color_viridis_d("Extraction plate number", direction = -1, option = "A", end = 0.8) It looks like \(f\) is a linear function of \(\log(SQ^*)\): \(C = \beta_0 + \beta_1 \log(SQ^*)\). This seems appropriate for the LBP strains, but for the total 16S, the standard curves are not linear. Probably not too much of an issue and unlikely to create huge batch effects.
Adjusted quantities (quant_adjusted) are then computed from the estimated starting_quantity values using the dilution factors.
\(\hat{Q} = \hat{SQ} \times d\)
where \(\hat{Q}\) is the estimated adjusted quantity (quant_adjusted), \(\hat{SQ}\) is the estimated starting quantity (starting_quantity) for the clinical samples, and \(d\) is the dilution factor.
qpcr |>
ggplot() +
aes(
x = starting_quantity |> log10(),
y = quant_adjusted |> log10(),
col = dilution_factor |> factor()
) +
geom_point() +
facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
scale_color_manual(
"Dilution factor",
values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
)quant_adjusted = starting_quantity x dilution_factor:
qpcr |>
ggplot() +
aes(
x = (starting_quantity * dilution_factor) |> log10(),
y = (quant_adjusted) |> log10(),
col = dilution_factor |> factor()
) +
geom_point() +
facet_wrap(. ~ strain_group_qpcr + target, ncol = 9) +
scale_color_manual(
"Dilution factor",
values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
)copies_per_swab are the adjusted_quant multiplied by a constant factor estimated to be the number of bacteria in 1 concentration unit.
Here, that number is 250 for all LBP targets.
# strain_group_qpcr + target
qpcr |>
filter(target != "16S") |>
ggplot() +
aes(
x = (quant_adjusted * 250) |> log10(),
y = copies_per_swab |> log10(),
col = target
) +
geom_abline(intercept = 0, slope = 1, col = "gray") +
geom_point(size = 0.5, alpha = 0.4) +
facet_wrap(. ~ ext_lib_plate_nb , ncol = 6) For the 16S target, that number is 500.
qpcr |>
filter(target == "16S") |>
mutate(
r = copies_per_swab / (quant_adjusted )
) |>
pull(r) |>
hist()Dilution factors are defined per qPCR plate (and consequently, are the same for the 3 targets in the same VMRC group)
qpcr |>
dplyr::count(target, dilution_factor)
qpcr |>
group_by(pcr_plate_id, target) |>
summarize(n_dilution_factors = dilution_factor |> unique() |> length()) #|
qpcr |>
select(
starts_with("well"),
pcr_plate_id,
strain_group_qpcr,
ext_lib_plate_nb,
dilution_factor
) |>
distinct() |>
ggplot() +
aes(
x = well_row |> factor(),
y = well_col |> fct_rev(),
fill = dilution_factor |> factor()
) +
geom_tile() +
facet_grid(strain_group_qpcr ~ ext_lib_plate_nb, scales = "free", space = "free") +
scale_fill_manual(
"Dilution factor",
values = colorRampPalette(c("dodgerblue1", "gray90"))(4)
) +
xlab("Well column") + ylab("Well row") qpcr |>
select(
starts_with("well"),
pcr_plate_id,
target,
strain_group_qpcr,
ext_lib_plate_nb,
cq
) |>
distinct() |>
ggplot() +
aes(
x = well_row |> factor(),
y = well_col |> fct_rev(),
fill = cq
) +
geom_tile() +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
xlab("Well column") + ylab("Well row") qpcr |>
select(
starts_with("well"),
pcr_plate_id,
target,
strain_group_qpcr,
ext_lib_plate_nb,
copies_per_swab
) |>
distinct() |>
ggplot() +
aes(
x = well_row |> factor(),
y = well_col |> fct_rev(),
fill = copies_per_swab |> asinh()
) +
geom_tile() +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
xlab("Well column") + ylab("Well row") From these visualizations of the raw data, something looks weird with the cq and copies_per_swab values for the first 3 plates of “VMRC group 1”.
g <-
qpcr |>
ggplot() +
aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
geom_boxplot(alpha = 0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(. ~ target_type, scales = "free", space = "free")
g g + scale_y_log10()g <-
qpcr |>
ggplot() +
aes(x = target, y = copies_per_swab, col = qpcr_sample_type, fill = qpcr_sample_type) +
geom_boxplot(alpha = 0.5) +
facet_grid(qpcr_sample_type ~ ., scales = "free") +
guides(fill = "none", color = "none") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
) +
facet_grid(. ~ target_type, scales = "free", space = "free")
g g + scale_y_log10()g <-
qpcr |>
ggplot() +
aes(x = ext_lib_plate_nb |> factor(), y = copies_per_swab, col = target, fill = target) +
geom_boxplot(alpha = 0.5) +
facet_grid(qpcr_sample_type ~ ., scales = "free") +
xlab("Extraction plate number") +
theme(
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90, hjust = 0)
)
g g + scale_y_log10() From the plots above, it looks like the empty and water samples have non-zero values on a few LBP plates.
plot_qpcr_empties <- function(qpcr, color_by = "qpcr_sample_type") {
qpcr |>
filter(qpcr_sample_type %in% c("Empty", "Water")) |>
group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
ggplot() +
aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
geom_text(size = 3) +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)
}plot_qpcr_empties(qpcr, color_by = "qpcr_sample_type") plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") prob_target <-
qpcr |>
select(target) |>
distinct() |>
mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))We see the same issues as above for the first 3 plates of VMRC group 1.
qpcr |>
filter(qpcr_sample_type %in% c("Control sample")) |>
group_by(sample, pcr_plate_id, ext_lib_plate_nb, target) |>
mutate(replicate_nb = row_number()) |>
ungroup() |>
mutate(
control_type = control_type |> str_replace_na("? Unknown (not in MG manifest)"),
vibr_sample_id = vibr_sample_id |> fct_reorder(control_type),
plate_nb = str_c("ext. plate\n", ext_lib_plate_nb) |> fct_reorder(ext_lib_plate_nb),
) |>
ggplot() +
aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = control_type) +
geom_text(size = 3) +
scale_y_log10() +
scale_color_discrete("Control type") +
facet_grid(strain_group_qpcr + target ~ plate_nb, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
# strip.text.x = element_text(angle = 90),
strip.text.y = element_text(angle = 0)
) I am surprise to see that almost all Mock 1 samples are positive (copies per swabs >>) because, these samples were supposed to be pooled samples from the CAP084 study. I don’t think that participants of that study received the LBP. It could also be that I made wrong assumptions when matching the vibr_sample_id with the metagenomics manifest data.
The “negative controls” look better (= more consistent with expectations), except, again, for the first 3 plates of VMRC group 1.
qpcr |>
filter(qpcr_sample_type %in% "Standard") |>
ggplot() +
aes(x = starting_quantity, y = cq, col = replicate_nb |> factor()) +
geom_point(alpha = 0.5, size = 0.5) +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb) +
scale_x_log10() +
theme(
strip.text.y = element_text(angle = 0),
axis.text.x = element_text(angle = 45, hjust = 1)
)For the 16S, there seems to be “reverse saturation” (i.e., the cq is lower than expected with a linear relationship). Maybe the dilution was not that reliable at such low concentrations.
qpcr |>
filter(qpcr_sample_type %in% "Standard") |>
mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |>
ggplot() +
aes(x = starting_quantity, y = cq, col = dilution_factor |> factor()) +
geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
geom_text(alpha = 0.2, aes(label = std_nb)) +
facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
scale_x_log10() +
scale_color_manual(
name = "Dilution factor",
breaks = c(5,10,20,30),
values = c("black", "deeppink3", "deeppink1", "pink")
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)It looks like standard 4 was not prepared or placed properly for VMRC group 4.
qpcr |>
filter(qpcr_sample_type %in% "Standard") |>
mutate(std_nb = content |> str_remove("Std-") |> parse_number()) |>
ggplot() +
aes(x = starting_quantity, y = cq, col = ext_lib_plate_nb |> factor()) +
geom_path(aes(group = interaction(ext_lib_plate_nb, replicate_nb)), alpha = 0.2) +
geom_text(alpha = 0.2, aes(label = std_nb)) +
facet_wrap(strain_group_qpcr + target ~ ., ncol = 9) +
scale_x_log10() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)This shows again the issues with the first 3 plates of VMRC group 1.
qpcr |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
group_by(vibr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
mutate(
replicate_nb = row_number(),
median_cps = median(copies_per_swab, na.rm = TRUE),
mean_cps = mean(copies_per_swab, na.rm = TRUE)
) |>
ungroup() |>
mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
ggplot() +
aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
geom_text(size = 3) +
scale_y_log10() +
scale_color_discrete("replicate") +
scale_x_discrete("Samples", breaks = NULL) +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)We notice again the same issues with the VRMC group 1 plate 1-3.
We could simply discard these values (exclude these targets x plates) or try to identify a new threshold to differentiate between background noise and actual signal.
The idea is to use the negative controls cq values to define plate- and target-specific “detection” thresholds, then use these thresholds to set to 0 the copies_per_swab values for which the cq values are above the threshold.
The figures below show the cq values for the VMRC group 1 and 2.
qpcr |>
filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
ggplot() +
aes(
x = well_row |> factor(),
y = well_col |> fct_rev(),
fill = cq
) +
geom_tile() +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
xlab("Well column") + ylab("Well row") We impute the missing cq values to 45 (well above the maximum observed value, and what I assume was the total number of cycles).
qpcr <- qpcr |> mutate(cq_imp = cq |> replace_na(45)) qpcr |>
filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2")) |>
ggplot() +
aes(
x = well_row |> factor(),
y = well_col |> fct_rev(),
fill = cq_imp
) +
geom_tile() +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free", space = "free") +
xlab("Well column") + ylab("Well row") cq values per sample typeqpcr <-
qpcr |>
mutate(
simplified_type =
case_when(
str_detect(control_type, "Mock") ~ "Positive control",
(qpcr_sample_type == "Control sample") & is.na(sample_type) ~ "Unknown control",
qpcr_sample_type %in% c("Control sample", "Empty", "Water") ~ "Negative control",
TRUE ~ qpcr_sample_type
) |>
fct_infreq(),
qpcr_sample_type = qpcr_sample_type |> fct_infreq()
)
# qpcr |> dplyr::count(simplified_type, sample_type, control_type, qpcr_sample_type)We define the thresholds as the average over the two smallest cq value for the “empties” (i.e., empty and water samples, as well as the negative controls).
thresholds <-
qpcr |>
filter(target != "16S") |>
group_by(target, strain_group_qpcr, ext_lib_plate_nb) |>
summarize(
min_cq_empties = min(cq_imp[simplified_type == "Negative control"], na.rm = TRUE),
robust_min_cq_empties = cq_imp[simplified_type == "Negative control"] |> sort() |> extract(1:2) |> mean(),
max_cq_std = max(cq_imp[simplified_type == "Standard"], na.rm = TRUE),
.groups = "drop"
) |>
mutate(
threshold_cq = robust_min_cq_empties
)qpcr |>
filter(
strain_group_qpcr %in% c("VMRC_1", "VMRC_2"),
ext_lib_plate_nb <= 6,
simplified_type %in% c("VIBRANT clinical sample", "Standard", "Negative control")
) |>
arrange(simplified_type) |>
ggplot() +
aes(x = simplified_type |> str_wrap(width = 15), y = cq_imp, col = qpcr_sample_type) + #
geom_violin(col = "gray") +
geom_jitter(height = 0, width = 0.25, alpha = 0.5, size = 0.75) +
geom_hline(
data = thresholds |> filter(strain_group_qpcr %in% c("VMRC_1", "VMRC_2"), ext_lib_plate_nb <= 6),
aes(yintercept = threshold_cq), col = "steelblue1"
) +
facet_grid(target ~ ext_lib_plate_nb) +
scale_color_manual(values = c("steelblue1", "black", "dodgerblue3", "red","pink") |> rev()) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
) +
xlab("Sample type") qpcr <-
qpcr |>
select(-any_of(c("min_cq_empties","robust_min_cq_empties", "max_cq_std", "threshold_cq"))) |>
left_join(thresholds, by = join_by(ext_lib_plate_nb, strain_group_qpcr, target)) For all plates and targets
qpcr |>
filter(qpcr_sample_type == "VIBRANT clinical sample") |>
ggplot() +
aes(x = cq_imp, y = target) +
geom_violin(fill = "gray90", col = "transparent") +
geom_jitter(width = 0, size = 0.1, col = "pink") +
geom_vline(aes(xintercept = min_cq_empties), col = "dodgerblue") +
geom_vline(aes(xintercept = robust_min_cq_empties), col = "green3") +
geom_vline(aes(xintercept = max_cq_std), col = "red") +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free") +
theme(strip.text.y = element_text(angle = 0)) +
labs(
x = "cq value (imputed to 45 when not avalaible)",
caption =
str_c(
"Pink dots are the cq values for the VIBRANT clinical samples\n",
"The blue lines are the minimum cq value for the negative controls\n",
"The green lines are the average of the two smallest cq values for the negative controls (= the detection threshold)\n",
"The red lines are the maximum cq value for the standards"
)
)We create a new column copies_per_swab_adj that is set to 0 if the cq_imp is above the threshold.
qpcr <-
qpcr |>
mutate(
copies_per_swab_adj =
case_when(
(target != "16S") & (cq_imp >= threshold_cq) ~ 0,
TRUE ~ copies_per_swab
)
)qpcr |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
group_by(qpcr_sample_id, sample, pcr_plate_id, ext_lib_plate_nb, target) |>
mutate(
replicate_nb = row_number(),
median_cps = median(copies_per_swab_adj, na.rm = TRUE),
mean_cps = mean(copies_per_swab_adj, na.rm = TRUE)
) |>
ungroup() |>
mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(mean_cps)) |>
ggplot() +
aes(x = qpcr_sample_id, y = copies_per_swab_adj, label = replicate_nb, col = replicate_nb |> factor()) +
geom_line(aes(group = qpcr_sample_id), col = "black", linewidth = 0.1) +
geom_text(size = 3) +
scale_y_log10() +
scale_color_discrete("replicate") +
scale_x_discrete("Samples", breaks = NULL) +
facet_grid(strain_group_qpcr + target ~ ext_lib_plate_nb, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0)
)This does not fully solve the problem, but I hope that at the “aggregation” step, taking the median value would remove a lot of false positive.
If not, a more stringent threshold could potentially be used, or these samples would have to be excluded from the analysis.
qpcr <-
qpcr |>
group_by(qpcr_sample_id, target) |>
mutate(
copies_per_swab_raw_median = median(copies_per_swab, na.rm = TRUE),
copies_per_swab_adj_median = median(copies_per_swab_adj, na.rm = TRUE),
copies_per_swab_adj_range = max(copies_per_swab_adj, na.rm = TRUE) - min(copies_per_swab_adj, na.rm = TRUE),
) |>
ungroup()qpcr |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_raw_median > 0)) |> ungroup() |>
mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
ggplot() +
aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_raw_median |> log10()) +
geom_tile() +
facet_grid(strain_group_qpcr + LBP ~ ext_lib_plate_nb, scales = "free", space = "free") +
scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
theme(
axis.text.x = element_blank(),
legend.position = "bottom"
) +
ggtitle("Median of the original `copies_per_swabs`")qpcr |>
filter(qpcr_sample_type %in% c("VIBRANT clinical sample")) |>
group_by(qpcr_sample_id) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
mutate(qpcr_sample_id = qpcr_sample_id |> fct_reorder(n_target_detected)) |>
ggplot() +
aes(x = qpcr_sample_id, y = target, fill = copies_per_swab_adj_median |> log10()) +
geom_tile() +
facet_grid(strain_group_qpcr + LBP ~ ext_lib_plate_nb, scales = "free", space = "free") +
scale_fill_gradient(low = "dodgerblue1", high = "black", na.value = "white") +
theme(
axis.text.x = element_blank(),
legend.position = "bottom"
) +
ggtitle("Median of the adjusted `copies_per_swabs`")The threshold correction likely helped in reducing the number of false positive.
When looking at the longitudinal patterns for randomized participants, we do not notice an over-representation of a specific plate or group in the “isolated” positives:
qpcr |>
filter(
sample_type %in% c("Clinical sample"),
sample_category %in% c("Expected sample"),
visit_code %in% c(seq(1000, 1900, by = 100), 2120)
) |>
group_by(pid) |> mutate(n_target_detected = sum(copies_per_swab_adj_median > 0)) |> ungroup() |>
mutate(pid = pid |> fct_reorder(n_target_detected)) |>
ggplot() +
aes(x = pid, y = visit_code, fill = ext_lib_plate_nb |> factor(), alpha = copies_per_swab_adj_median |> log10()) +
geom_tile() +
facet_grid(strain_group_qpcr + target ~ ., scales = "free") +
scale_fill_discrete("Extraction plate number") +
scale_alpha_continuous(limits = c(0, 10)) +
theme(
axis.text.x = element_blank(),
strip.text.y = element_text(angle = 0),
legend.position = "bottom"
)We should thus expect some false positive in the LBP qPCR data :)
qpcr |>
filter(
sample_type %in% c("Clinical sample"),
sample_category %in% c("Expected sample"),
visit_code %in% c(seq(1000, 1900, by = 100), 2120),
target == "16S"
) |>
arrange(pid, visit_code) |>
select(pid, visit_code, copies_per_swab_adj_median) |>
distinct() |>
ggplot() +
aes(x = visit_code, y = copies_per_swab_adj_median, col = pid) +
geom_boxplot(alpha = 0.2, fill = "dodgerblue1", color = "dodgerblue1") +
geom_line(aes(group = pid), alpha = 0.2) +
# geom_point(size = 0.5) +
ggbeeswarm::geom_quasirandom(width = 0.2) +
scale_y_log10() +
guides(col = "none") +
ggtitle("16S copies per swab") SummarizedExperiment objectsWe create two Summarized Experiment objects:
one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa)
one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including “biological” control samples) and each feature is a target (LBP taxa)
SummarizedExperiment objectWe create a SummarizedExperiment object with the following assays:
dilution_factorcqstarting_quantitycq_meanquant_adjustedcopies_per_swab_raw (current copies_per_swab column)copies_per_swab (current copies_per_swab_adj column)The “samples” are defined by the qpcr_uid column (one per well and plate) and the “features” are defined by the target column (i.e., the LBP taxa).
The colData contains the following columns:
qpcr_uid (the qPCR specific unique identifier)uid (the VIBRANT cross-assay unique sample identifier)pid (the participant ID)visit_code (the visit code)qpcr_sample_idvibr_sample_idqpcr_sample_typesample_typecontrol_typeext_lib_plate_nbext_lib_plate_idThe rowData contains the following columns:
fluorstrain_group_qpcrpcr_plate_id (the concatenation of pcr_plate_id for each ext_lib_plate_nb)taxon_labelLBPstrain_idstrain_originbiose_idqpcr |> dplyr::count(qpcr_uid) |> dplyr::count(n)
qpcr |> dplyr::count(qpcr_uid) |> nrow()
qpcr |> dplyr::count(qpcr_uid) |> filter(n != 16) |> View()make_raw_qpcr_SE <- function(qpcr){
qpcr <-
qpcr |>
arrange(ext_lib_plate_nb, well_col, well_row) |>
mutate(qpcr_uid = qpcr_uid |> fct_inorder()) |>
arrange(strain_group_qpcr, fluor) |>
mutate(target = target |> fct_inorder()) |>
arrange(qpcr_uid, target)
# ASSAYS
dilution_assay <-
qpcr |>
select(qpcr_uid, target, dilution_factor) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = dilution_factor) |>
as.data.frame() |>
column_to_rownames("target")
cq_assay <-
qpcr |>
select(qpcr_uid, target, cq) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = cq) |>
as.data.frame() |>
column_to_rownames("target")
starting_quantity_assay <-
qpcr |>
select(qpcr_uid, target, starting_quantity) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = starting_quantity) |>
as.data.frame() |>
column_to_rownames("target")
cq_mean_assay <-
qpcr |>
select(qpcr_uid, target, cq_mean) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = cq_mean) |>
as.data.frame() |>
column_to_rownames("target")
quant_adjusted_assay <-
qpcr |>
select(qpcr_uid, target, quant_adjusted) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = quant_adjusted) |>
as.data.frame() |>
column_to_rownames("target")
copies_per_swab_raw_assay <-
qpcr |>
select(qpcr_uid, target, copies_per_swab) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab) |>
as.data.frame() |>
column_to_rownames("target")
copies_per_swab_assay <-
qpcr |>
select(qpcr_uid, target, copies_per_swab_adj) |>
arrange() |>
pivot_wider(names_from = qpcr_uid, values_from = copies_per_swab_adj) |>
as.data.frame() |>
column_to_rownames("target")
# COLDATA
coldata <-
qpcr |>
select(
qpcr_uid,
uid, pid, visit_code, location,
qpcr_sample_type, sample_type, control_type,
qpcr_sample_id, vibr_sample_id, replicate_nb,
ext_lib_plate_nb, ext_lib_plate_id
) |>
distinct() |>
arrange(qpcr_uid) |>
as.data.frame() |>
mutate(rownames = qpcr_uid) |>
column_to_rownames("rownames")
# ROWDATA
rowdata <-
qpcr |>
select(
target, fluor, strain_group_qpcr,
pcr_plate_id, ext_lib_plate_nb,
taxon_label, LBP, strain_id, strain_origin, biose_id
) |>
distinct() |>
group_by(
target, fluor, strain_group_qpcr,
taxon_label, LBP, strain_id, strain_origin, biose_id
) |>
arrange(target, ext_lib_plate_nb) |>
mutate(
pcr_plate_id = str_c(pcr_plate_id, " (extr. plate ", ext_lib_plate_nb |> str_pad(width = 2, pad = "0"), ")", sep = ""),
) |>
summarize(
pcr_plate_ids = pcr_plate_id |> unique() |> str_c(collapse = ", "),
.groups = "drop"
) |>
ungroup() |>
as.data.frame() |>
mutate(rownames = target) |>
column_to_rownames("rownames")
SE <- SummarizedExperiment(
assays = list(
dilution = dilution_assay |> as.matrix(),
cq = cq_assay |> as.matrix(),
cq_mean = cq_mean_assay |> as.matrix(),
starting_quantity = starting_quantity_assay |> as.matrix(),
quant_adjusted = quant_adjusted_assay |> as.matrix(),
copies_per_swab_raw = copies_per_swab_raw_assay |> as.matrix(),
copies_per_swab = copies_per_swab_assay |> as.matrix()
),
colData = coldata,
rowData = rowdata,
metadata = list(
description = "Raw qPCR data from the VIBRANT study",
date = today(),
assay_and_coldata_dictionary = dictionary
)
)
SE
}SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)
SE_qPCR_raw# A SummarizedExperiment-tibble abstraction: 58,032 × 31
# Features=16 | Samples=3627 | Assays=dilution, cq, cq_mean, starting_quantity,
# quant_adjusted, copies_per_swab_raw, copies_per_swab
.feature .sample dilution cq cq_mean starting_quantity quant_adjusted
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 C0175A1 std_01_plat… 10 15.3 15.2 10000000 100000000
2 C0022A1 std_01_plat… 10 16.0 16.0 10000000 100000000
3 C0059E1 std_01_plat… 10 15.4 15.4 10000000 100000000
4 UC101 std_01_plat… 10 13.9 13.9 10000000 100000000
5 FF00018 std_01_plat… 10 14.6 14.6 10000000 100000000
6 FF00051 std_01_plat… 10 14.5 14.6 10000000 100000000
7 185329 std_01_plat… 30 14.0 13.9 10000000 300000000
8 C0028A1 std_01_plat… 30 14.1 14.0 10000000 300000000
9 C0112A1 std_01_plat… 30 14.6 14.6 10000000 300000000
10 FF00004 std_01_plat… 10 14.6 14.6 10000000 100000000
# ℹ 310 more rows
# ℹ 24 more variables: copies_per_swab_raw <dbl>, copies_per_swab <dbl>,
# qpcr_uid <fct>, uid <chr>, pid <chr>, visit_code <chr>, location <chr>,
# qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
# qpcr_sample_id <chr>, vibr_sample_id <chr>, replicate_nb <int>,
# ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>, target <fct>, fluor <chr>,
# strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>, …
SummarizedExperiment objectWe create a SummarizedExperiment object with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays
dilutioncopies_per_swab_med (the median copies_per_swab across replicates)copies_per_swab_mean (the mean copies_per_swab across replicates)copies_per_swab_cv (the coefficient of variation of copies_per_swab across replicates)The colData contains the same columns as above (except for those that are replicate-specific)
The rowData contains the same columns as above,
agg_qpcr_SE <- function(SE_qPCR_raw){
filtered_qpcr <-
SE_qPCR_raw |>
filter(
!(qpcr_sample_type %in% c("Standard", "Water", "Empty")),
!is.na(uid)
)
summarized_qpcr <-
filtered_qpcr |>
as_tibble() |>
group_by(uid, target) |>
summarize(
dilution = mean(dilution, na.rm = TRUE),
copies_per_swab_med = median(copies_per_swab),
copies_per_swab_mean = mean(copies_per_swab),
copies_per_swab_sd = sd(copies_per_swab),
copies_per_swab_range = max(copies_per_swab) - min(copies_per_swab),
.groups = "drop"
) |>
mutate(
copies_per_swab_cv = ifelse(copies_per_swab_sd == 0, 0, copies_per_swab_sd / copies_per_swab_mean)
)
# COLDATA
coldata <-
filtered_qpcr |>
colData() |>
as.data.frame() |>
as_tibble() |>
select(-qpcr_uid, -replicate_nb) |>
distinct() |>
arrange(uid) |>
as.data.frame() |>
mutate(rownames = uid) |>
column_to_rownames("rownames")
# ROWDATA
rowdata <-
filtered_qpcr |>
rowData() |>
as.data.frame()
SE <- SummarizedExperiment(
assays = list(
dilution =
summarized_qpcr |>
pivot_wider(id_cols = target, names_from = uid, values_from = dilution) |>
as.data.frame() |>
column_to_rownames("target") |>
as.matrix(),
copies_per_swab_med =
summarized_qpcr |>
pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_med) |>
as.data.frame() |>
column_to_rownames("target") |>
as.matrix(),
copies_per_swab_mean =
summarized_qpcr |>
pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_mean) |>
as.data.frame() |>
column_to_rownames("target") |>
as.matrix(),
copies_per_swab_cv =
summarized_qpcr |>
pivot_wider(id_cols = target, names_from = uid, values_from = copies_per_swab_cv) |>
as.data.frame() |>
column_to_rownames("target") |>
as.matrix()
),
colData = coldata,
rowData = rowdata,
metadata = list(
description = "Aggregated qPCR data from the VIBRANT study",
date = today(),
assay_and_coldata_dictionary = SE_qPCR_raw@metadata$assay_and_coldata_dictionary
)
)
SE
}SE_qPCR_agg <- agg_qpcr_SE(SE_qPCR_raw = SE_qPCR_raw)
SE_qPCR_agg# A SummarizedExperiment-tibble abstraction: 16,224 × 26
# Features=16 | Samples=1014 | Assays=dilution, copies_per_swab_med,
# copies_per_swab_mean, copies_per_swab_cv
.feature .sample dilution copies_per_swab_med copies_per_swab_mean
<chr> <chr> <dbl> <dbl> <dbl>
1 C0175A1 068100004_0000 5 0 0
2 C0022A1 068100004_0000 5 0 0
3 C0059E1 068100004_0000 5 0 0
4 UC101 068100004_0000 5 0 0
5 FF00018 068100004_0000 5 0 0
6 FF00051 068100004_0000 5 0 0
7 185329 068100004_0000 20 0 0
8 C0028A1 068100004_0000 20 0 0
9 C0112A1 068100004_0000 20 0 0
10 FF00004 068100004_0000 20 0 0
# ℹ 310 more rows
# ℹ 21 more variables: copies_per_swab_cv <dbl>, uid <chr>, pid <chr>,
# visit_code <chr>, location <chr>, qpcr_sample_type <fct>,
# sample_type <chr>, control_type <chr>, qpcr_sample_id <chr>,
# vibr_sample_id <chr>, ext_lib_plate_nb <dbl>, ext_lib_plate_id <dbl>,
# target <fct>, fluor <chr>, strain_group_qpcr <chr>, taxon_label <chr>,
# LBP <fct>, strain_id <fct>, strain_origin <fct>, biose_id <chr>, …
SummarizedExperiment objectsSave the SE objects to disk
saveRDS(
SE_qPCR_raw,
str_c(
get_output_dir(data_source = data_source),
"03_se_pcr_raw_", today() |> str_remove_all("-"), ".rds"
)
)
saveRDS(
SE_qPCR_agg,
str_c(
get_output_dir(data_source = data_source),
"03_se_pcr_agg_", today() |> str_remove_all("-"), ".rds"
)
)